0. Load Libraries and Define Functions

# 0.0  Load libraries ----
# tidyverse attaches dplyr, tidyr, ggplot2, lubridate, etc.;
# tidyquant attaches xts, quantmod, TTR, and PerformanceAnalytics
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidyquant)  
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo 
## ── Attaching core tidyquant packages ──────────────────────── tidyquant 1.0.9 ──
## ✔ PerformanceAnalytics 2.0.4      ✔ TTR                  0.24.4
## ✔ quantmod             0.4.26     ✔ xts                  0.14.0
## ── Conflicts ────────────────────────────────────────── tidyquant_conflicts() ──
## ✖ zoo::as.Date()                 masks base::as.Date()
## ✖ zoo::as.Date.numeric()         masks base::as.Date.numeric()
## ✖ dplyr::filter()                masks stats::filter()
## ✖ xts::first()                   masks dplyr::first()
## ✖ dplyr::lag()                   masks stats::lag()
## ✖ xts::last()                    masks dplyr::last()
## ✖ PerformanceAnalytics::legend() masks graphics::legend()
## ✖ quantmod::summary()            masks base::summary()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(broom)
#0.1 Define functions ----
# The following functions explicitly compute annual, monthly, weekly and daily log returns
#' Compute yearly log returns from the `adjusted` column.
#'
#' Hands `stock.returns` to `tq_transmute()`, which applies `periodReturn`
#' to the `adjusted` column with `period = "yearly"` and `type = "log"`.
#'
#' @param stock.returns Data passed as the first argument of
#'   `tq_transmute()`; it must contain an `adjusted` column (presumably
#'   price data such as the output of `tq_get()`, despite the parameter
#'   name -- confirm against callers).
#' @return The value returned by `tq_transmute()` for this call.
get_annual_returns <- function(stock.returns) {
  tq_transmute(
    stock.returns,
    select     = adjusted,
    mutate_fun = periodReturn,
    period     = "yearly",
    type       = "log"
  )
}

#' Compute monthly log returns from the `adjusted` column.
#'
#' Hands `stock.returns` to `tq_transmute()`, which applies `periodReturn`
#' to the `adjusted` column with `period = "monthly"` and `type = "log"`.
#'
#' @param stock.returns Data passed as the first argument of
#'   `tq_transmute()`; it must contain an `adjusted` column (presumably
#'   price data such as the output of `tq_get()`, despite the parameter
#'   name -- confirm against callers).
#' @return The value returned by `tq_transmute()` for this call.
get_monthly_returns <- function(stock.returns) {
  tq_transmute(
    stock.returns,
    select     = adjusted,
    mutate_fun = periodReturn,
    period     = "monthly",
    type       = "log"
  )
}

#' Compute weekly log returns from the `adjusted` column.
#'
#' Hands `stock.returns` to `tq_transmute()`, which applies `periodReturn`
#' to the `adjusted` column with `period = "weekly"` and `type = "log"`.
#'
#' @param stock.returns Data passed as the first argument of
#'   `tq_transmute()`; it must contain an `adjusted` column (presumably
#'   price data such as the output of `tq_get()`, despite the parameter
#'   name -- confirm against callers).
#' @return The value returned by `tq_transmute()` for this call.
get_weekly_returns <- function(stock.returns) {
  tq_transmute(
    stock.returns,
    select     = adjusted,
    mutate_fun = periodReturn,
    period     = "weekly",
    type       = "log"
  )
}

#' Compute daily log returns from the `adjusted` column.
#'
#' Hands `stock.returns` to `tq_transmute()`, which applies `periodReturn`
#' to the `adjusted` column with `period = "daily"` and `type = "log"`.
#'
#' @param stock.returns Data passed as the first argument of
#'   `tq_transmute()`; it must contain an `adjusted` column (presumably
#'   price data such as the output of `tq_get()`, despite the parameter
#'   name -- confirm against callers).
#' @return The value returned by `tq_transmute()` for this call.
get_daily_returns <- function(stock.returns) {
  tq_transmute(
    stock.returns,
    select     = adjusted,
    mutate_fun = periodReturn,
    period     = "daily",
    type       = "log"
  )
}

1. Collect stock price data from Yahoo ----

1.1 Get data on all stocks in the SP500 ----

# Fetch the holdings table for the "SP500" index via tidyquant::tq_index()
stocks_tbl <- tq_index(x = "SP500")
## Getting holdings for SP500
# Print the table (one row per holding)
stocks_tbl
## # A tibble: 505 × 8
##    symbol company      identifier sedol weight sector shares_held local_currency
##    <chr>  <chr>        <chr>      <chr>  <dbl> <chr>        <dbl> <chr>         
##  1 AAPL   APPLE INC    037833100  2046… 0.0692 -        171767010 USD           
##  2 MSFT   MICROSOFT C… 594918104  2588… 0.0655 -         88569779 USD           
##  3 NVDA   NVIDIA CORP  67066G104  2379… 0.0569 -        293126746 USD           
##  4 AMZN   AMAZON.COM … 023135106  2000… 0.0349 -        109092981 USD           
##  5 META   META PLATFO… 30303M102  B7TL… 0.0240 -         26127829 USD           
##  6 GOOGL  ALPHABET IN… 02079K305  BYVY… 0.0190 -         70032798 USD           
##  7 BRK-B  BERKSHIRE H… 084670702  2073… 0.0181 -         21584443 USD           
##  8 GOOG   ALPHABET IN… 02079K107  BYY8… 0.0159 -         58262441 USD           
##  9 LLY    ELI LILLY +… 532457108  2516… 0.0157 -          9507966 USD           
## 10 JPM    JPMORGAN CH… 46625H100  2190… 0.0135 -         34237071 USD           
## # ℹ 495 more rows
# # Symbol "BRK.B" gives error from tq_get  
# # Change to "BRK-B"
# rowindex_0<-which(stocks_tbl$symbol=="BRK.B")
# stocks_tbl$symbol[rowindex_0]<-"BRK-B"
# 
# # Change to "BF.B" to "BF-B"
# rowindex_0<-which(stocks_tbl$symbol=="BF.B")
# stocks_tbl$symbol[rowindex_0]<-"BF-B"
# 
# Drop rows whose symbol is "-".
# The rows are removed with a logical mask rather than negative positions:
# when no symbol equals "-", which() returns integer(0), and
# stocks_tbl[-integer(0), ] would select zero rows, silently discarding
# every stock. %in% also keeps rows with an NA symbol, as which() did.
index_dash <- which(stocks_tbl$symbol == "-")
stocks_tbl0 <- stocks_tbl[!(stocks_tbl$symbol %in% "-"), ]

# Collect all symbols in stocks_tbl0

stocks_symbols <- stocks_tbl0$symbol

1.2 Set start and end dates for price data —-

# First and last dates passed to tq_get() (as its `from` and `to` arguments)
date_start <- "2019-01-01"
date_end <- "2024-08-31"


# Collecting data for a large group of stocks can take a
# significant amount of time
#     (on 9/12/2023, less than 3 minutes).
# To monitor progress, we split the large group into subgroups.

1.3 Split large group of stocks into subgroups —-

# Assign each stock to a batch of 50 symbols; batches are numbered from 0.
# seq_len() is used instead of 1:nrow() so that an empty table produces no
# batches rather than the bogus index vector c(1, 0).
stocks_tbl0$group <- floor((seq_len(nrow(stocks_tbl0)) - 1) / 50)

list_group <- unique(stocks_tbl0$group)

# Commented-out variant of the loop header that skips the first four groups:
# for (group0 in list_group[-c(1:4)]) {

# Fetch prices one batch at a time. Each batch is stored in its own variable
# named stocks_data.<group>, and a progress line is printed after each batch.
for (group0 in list_group) {
  stocks_data <- stocks_tbl0 %>%
    filter(group == group0) %>%
    select(symbol, company) %>%
    tq_get(from = date_start, to = date_end)
  assign(paste("stocks_data", group0, sep = "."), stocks_data)
  print(c("End for group ", as.character(group0)))
}
## [1] "End for group " "0"             
## [1] "End for group " "1"             
## [1] "End for group " "2"             
## [1] "End for group " "3"             
## [1] "End for group " "4"             
## [1] "End for group " "5"             
## [1] "End for group " "6"             
## [1] "End for group " "7"             
## [1] "End for group " "8"             
## [1] "End for group " "9"             
## [1] "End for group " "10"
# If any warnings occur, determine which symbol generated the message.
# Then revise the symbol name (as above, replacing "." by "-") or delete it,
# as was the case for symbol = "-", which corresponded to US dollar.

1.4 Bind together data from all groups

# Stack the per-group tables stocks_data.<group> into a single table.
# The variable names are derived from list_group, so this works for any
# number of groups instead of a hard-coded 0..10; mget() raises an error if
# one of the expected per-group variables does not exist.
stocks_data <- dplyr::bind_rows(
  unname(mget(paste("stocks_data", list_group, sep = "."),
              envir = environment()))
)
dim(stocks_data)
## [1] 707653      9

1.5 Convert stocks_data to time series matrix

# Long table of prices: keep symbol, date and the adjusted column
# (renamed to price), dropping rows whose price is missing.
stocks_data1 <- stocks_data %>%
  dplyr::select(symbol, date, adjusted) %>%
  dplyr::filter(!is.na(adjusted)) %>%
  dplyr::rename(price = adjusted)

# Wide table: one row per date, one column per symbol.
# pivot_wider() keeps rows in order of first appearance, so the result is
# sorted by date explicitly in case the first symbol lacks some dates.
stocks_tsmatrix <- stocks_data1 %>%
  pivot_wider(names_from = symbol, values_from = price) %>%
  dplyr::arrange(date)

dim(stocks_tsmatrix)
## [1] 1426  504
# On 9/10/2024
# dim(stocks_tsmatrix)
# [1] 1426  504
names(stocks_tsmatrix)
##   [1] "date"  "AAPL"  "MSFT"  "NVDA"  "AMZN"  "META"  "GOOGL" "BRK-B" "GOOG" 
##  [10] "LLY"   "JPM"   "AVGO"  "TSLA"  "UNH"   "XOM"   "V"     "PG"    "JNJ"  
##  [19] "MA"    "COST"  "HD"    "ABBV"  "WMT"   "MRK"   "NFLX"  "KO"    "BAC"  
##  [28] "ADBE"  "PEP"   "CVX"   "CRM"   "TMO"   "AMD"   "ORCL"  "LIN"   "ACN"  
##  [37] "MCD"   "ABT"   "PM"    "CSCO"  "WFC"   "IBM"   "TXN"   "GE"    "QCOM" 
##  [46] "VZ"    "DHR"   "INTU"  "NOW"   "AMGN"  "ISRG"  "NEE"   "PFE"   "SPGI" 
##  [55] "CAT"   "DIS"   "RTX"   "GS"    "CMCSA" "T"     "UNP"   "AMAT"  "PGR"  
##  [64] "UBER"  "AXP"   "LOW"   "TJX"   "HON"   "BKNG"  "ELV"   "COP"   "LMT"  
##  [73] "MS"    "BLK"   "SYK"   "VRTX"  "BSX"   "REGN"  "MDT"   "PLD"   "CB"   
##  [82] "ETN"   "MMC"   "C"     "ADP"   "AMT"   "PANW"  "ADI"   "SBUX"  "MDLZ" 
##  [91] "CI"    "FI"    "TMUS"  "DE"    "BX"    "BMY"   "GILD"  "SO"    "NKE"  
## [100] "KLAC"  "LRCX"  "MU"    "SCHW"  "BA"    "UPS"   "MO"    "ICE"   "DUK"  
## [109] "CL"    "ZTS"   "SHW"   "ANET"  "INTC"  "EQIX"  "KKR"   "CME"   "TT"   
## [118] "WM"    "AON"   "WELL"  "MCO"   "HCA"   "PH"    "CMG"   "NOC"   "MSI"  
## [127] "PNC"   "PYPL"  "APH"   "TDG"   "CVS"   "MMM"   "SNPS"  "USB"   "CTAS" 
## [136] "TGT"   "EOG"   "CDNS"  "BDX"   "GD"    "ITW"   "ORLY"  "MCK"   "CSX"  
## [145] "AJG"   "FDX"   "ECL"   "APD"   "CARR"  "NXPI"  "ROP"   "NEM"   "NSC"  
## [154] "FCX"   "SLB"   "MPC"   "CRWD"  "EMR"   "TFC"   "AFL"   "DHI"   "PSA"  
## [163] "CEG"   "GEV"   "TRV"   "MAR"   "O"     "ADSK"  "AEP"   "COF"   "PSX"  
## [172] "WMB"   "GM"    "AZO"   "OKE"   "HLT"   "SPG"   "SRE"   "ABNB"  "CCI"  
## [181] "ROST"  "BK"    "KMB"   "PCAR"  "ALL"   "AIG"   "DLR"   "D"     "FTNT" 
## [190] "URI"   "FIS"   "JCI"   "MET"   "LEN"   "KVUE"  "TEL"   "MSCI"  "IQV"  
## [199] "FICO"  "VLO"   "AMP"   "LHX"   "CPRT"  "GWW"   "GIS"   "PAYX"  "PCG"  
## [208] "RSG"   "F"     "PRU"   "ACGL"  "HUM"   "KMI"   "MCHP"  "STZ"   "CMI"  
## [217] "A"     "PEG"   "MPWR"  "IDXX"  "EW"    "COR"   "CTVA"  "SYY"   "VRSK" 
## [226] "FAST"  "KDP"   "EXC"   "IT"    "CTSH"  "AME"   "RCL"   "CNC"   "YUM"  
## [235] "OTIS"  "EXR"   "HWM"   "RMD"   "EFX"   "PWR"   "ED"    "GEHC"  "MNST" 
## [244] "DOW"   "HES"   "IR"    "EA"    "HIG"   "VICI"  "XEL"   "CBRE"  "OXY"  
## [253] "KR"    "ODFL"  "BKR"   "NUE"   "DFS"   "DD"    "EIX"   "CSGP"  "IRM"  
## [262] "FANG"  "TRGP"  "CHTR"  "AVB"   "GLW"   "MLM"   "XYL"   "VMC"   "WTW"  
## [271] "EBAY"  "WEC"   "MTD"   "ON"    "ROK"   "HPQ"   "PPG"   "NDAQ"  "TSCO" 
## [280] "ADM"   "HSY"   "NVR"   "BIIB"  "CDW"   "KHC"   "LULU"  "FITB"  "AWK"  
## [289] "DAL"   "GPN"   "GRMN"  "MTB"   "WAB"   "PHM"   "DXCM"  "CAH"   "ANSS" 
## [298] "DVN"   "IFF"   "ETR"   "SBAC"  "CHD"   "VTR"   "DTE"   "AXON"  "EQR"  
## [307] "HAL"   "KEYS"  "FTV"   "MRNA"  "STT"   "DOV"   "BR"    "TTWO"  "TYL"  
## [316] "BRO"   "VST"   "STE"   "LYB"   "VLTO"  "ES"    "NTAP"  "PPL"   "TROW" 
## [325] "FE"    "DECK"  "HPE"   "SW"    "WST"   "CBOE"  "WY"    "RJF"   "FSLR" 
## [334] "AEE"   "ZBH"   "CPAY"  "CINF"  "GDDY"  "COO"   "K"     "HBAN"  "RF"   
## [343] "LDOS"  "MKC"   "SMCI"  "INVH"  "CLX"   "BLDR"  "HUBB"  "CMS"   "EL"   
## [352] "BALL"  "WDC"   "PTC"   "TDY"   "ATO"   "BAX"   "SYF"   "WAT"   "STX"  
## [361] "OMC"   "HOLX"  "CFG"   "ESS"   "LH"    "GPC"   "TER"   "BBY"   "EQT"  
## [370] "DRI"   "MOH"   "TSN"   "APTV"  "ULTA"  "MAA"   "ARE"   "PKG"   "J"    
## [379] "NTRS"  "WRB"   "DG"    "AVY"   "LUV"   "PFG"   "CNP"   "EXPD"  "DGX"  
## [388] "CTRA"  "TXT"   "MAS"   "EXPE"  "ZBRA"  "EG"    "IP"    "STLD"  "FDS"  
## [397] "NRG"   "WBD"   "VRSN"  "CCL"   "AMCR"  "UAL"   "SWKS"  "ALGN"  "DOC"  
## [406] "CAG"   "KIM"   "PODD"  "KEY"   "MRO"   "NI"    "LNT"   "IEX"   "SWK"  
## [415] "LVS"   "L"     "SNA"   "DPZ"   "RVTY"  "GEN"   "BG"    "AKAM"  "CF"   
## [424] "PNR"   "ENPH"  "DLTR"  "JBHT"  "ROL"   "EVRG"  "UHS"   "UDR"   "TRMB" 
## [433] "LYV"   "POOL"  "VTRS"  "NDSN"  "CPT"   "KMX"   "JKHY"  "JBL"   "SJM"  
## [442] "REG"   "JNPR"  "CE"    "IPG"   "FFIV"  "HST"   "ALLE"  "EPAM"  "CHRW" 
## [451] "EMN"   "TFX"   "TECH"  "AES"   "CTLT"  "LKQ"   "HII"   "BXP"   "TAP"  
## [460] "QRVO"  "CPB"   "AIZ"   "NWSA"  "PNW"   "MTCH"  "FOXA"  "MKTX"  "CRL"  
## [469] "AOS"   "TPR"   "HRL"   "SOLV"  "ALB"   "INCY"  "LW"    "HSIC"  "APA"  
## [478] "GL"    "MGM"   "GNRC"  "DAY"   "HAS"   "FRT"   "BF-B"  "MOS"   "DVA"  
## [487] "PAYC"  "FMC"   "BWA"   "MHK"   "CZR"   "WYNN"  "AAL"   "NCLH"  "IVZ"  
## [496] "RL"    "BIO"   "WBA"   "BBWI"  "ETSY"  "BEN"   "PARA"  "FOX"   "NWS"

1.6 Keep only the stocks with no missing data

# Number of missing values in each row (i.e. on each date), plotted over time
missingcount_bydate <- as.integer(rowSums(is.na(stocks_tsmatrix)))
plot(x = stocks_tsmatrix$date, y = missingcount_bydate, type = "l")

# Note dated 9/12/2022 (it predates the current date_end, so presumably from
# an earlier run -- verify): 11 stocks had some missing values over the whole
# period from date_start to date_end

# stocks_tsmatrix0 (defined below) keeps only the columns with no missing values

# Number of missing values in each column, plotted against column position
missingcount_bystock <- vapply(
  stocks_tsmatrix,
  function(column) sum(is.na(column)),
  integer(1)
)
plot(x = c(1:ncol(stocks_tsmatrix)),
     y = missingcount_bystock, type = "l")

sum(missingcount_bystock == 0)
## [1] 488
# 488 columns had no missing values on the period; the count is taken over
# every column of stocks_tsmatrix, so it includes the date column

which_cols_nomissing <- which(missingcount_bystock == 0)

# Extract the columns with no missing values ----
dim(stocks_tsmatrix)
## [1] 1426  504
stocks_tsmatrix0 <- stocks_tsmatrix[, which_cols_nomissing]

1.7 Create stocks_tbl0_tsmatrix0 corresponding to symbols in stocks_tsmatrix0

# names(stocks_tsmatrix0)
# Column count equals 1 (date) + (number of stocks)
dim(stocks_tsmatrix0)
## [1] 1426  488
# Position in stocks_tbl0$symbol of every column name except the first;
# nomatch = 0 gives index 0 for a name that is not found.
which_symbols_tsmatrix0 <- match(
  x       = colnames(stocks_tsmatrix0)[-1],
  table   = stocks_tbl0$symbol,
  nomatch = 0
)
# Rows of stocks_tbl0 in the same order as the price columns
stocks_tbl0_tsmatrix0 <- stocks_tbl0[which_symbols_tsmatrix0, ]

dim(stocks_tsmatrix0)
## [1] 1426  488

2. Save Workspaces

2.1 Save R workspace SP500_data_all.RData ----

# Write every object currently listed by ls() to SP500_data_all.RData
save(list = ls(), file = "SP500_data_all.RData")

2.2 Save R workspace SP500_data_subset.RData ----

# Give the complete-column price table and its matching stock table the names
# under which they are saved. Note that stocks_tbl is overwritten here.
stocks_prices <- stocks_tsmatrix0
stocks_tbl <- stocks_tbl0_tsmatrix0

# Save only these two objects
save(stocks_tbl, stocks_prices, file = "SP500_data_subset.RData")